# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
#
#     Paper: Exposure to worrisome topics can increase cognitive performance
#     Date: September 2023
#
#     Preliminary version at submission state
#     database used: worries_cognitive_performance.dta
#     output: table 1
#
#
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

## !!!!!!!!!!!!!! add your working directory here ******************************
##### Working directory structure:
##### Folders "Data"; "Codes"; "Output"
##### Data folder contains the raw dataset "base_anonym_AMU.xlsx"
##### Output folder contains following subfolders: "main figures"; "main tables"; "SI figures"; "SI tables"


library(devtools)
#install_github("susanathey/causalTree")
library(causalTree)
library(dplyr)
library(haven)
library(tidyr)
library(grf)
library(RCT)
library(stargazer)
library(sandwich)
library(lmtest)
library(ggplot2)
library(RColorBrewer)
library(MASS)
library(xtable)


set.seed(123)
setwd("C:/Users/raiber.e/Dropbox/Stress and cognitive ability of students covid/Submissions/Scientific Reports/Replication File")
table_path <- "C:/Users/raiber.e/Dropbox/Stress and cognitive ability of students covid/Submissions/Scientific Reports/Replication File/Output/main table"

dt.het <-  read_dta("Data/worries_cognitive_performance.dta")
# Generate treatment variables
dt.het <- data.frame(dt.het)

dt.het <- dt.het %>% mutate(treat_econ = ifelse(treat1==1, 1, 0),
                            treat_social = ifelse(treat1==2, 1, 0), 
                            treat_control = ifelse(treat1==3, 1, 0),
                            threshold = ifelse(payment==0,1, 0))  %>% 
  filter(exclusions==0 & finish==1)


dt.het <- dt.het %>% mutate( bourse= ifelse(bourse!=2, bourse, 0),
                             solve_extra_expenses= ifelse(solve_extra_expenses!=2, solve_extra_expenses, 0),
                             fatigue_level=ifelse(fatigue<=3, 1, 0),
                             selfcov= ifelse(selfcov!=2, selfcov, 0),
                             father_university= ifelse(father_diplome==1, 1, 0),
                             mother_university= ifelse(mother_diplome==1, 1, 0),
                             positive_vaccine= ifelse(vaccine==1 | vaccine==2,1,0))

# Function for the P-value stars
star <- function(x){
  out <- ifelse(x <= 0.1, ifelse(x <= 0.05, ifelse(x <= 0.01, "***", "**"), '*'), "")
  out
}

#List of covariates and labels
covariates <-  c( "age",  "woman",  "bourse",  "undergrad1",  "labormarket1",  "fatigue_level",  "Frm",  "sante_student",  "art_language", "law_econ",
  "sci_tech",  "social_science", "having_financial_struggles", "solve_extra_expenses", "own_salary", "low_proba_career", "low_proba_success", "pessimistic",
  "importance_finish_parents", "importance_grade_parents", "migrant",  "living_alone",  "father_university",  "mother_university", "both_parents_work", "depression",
  "anxiety", "selfcov", "closecov", "personal_trau_covid", "family_lost_job", "positive_vaccine", "lockdown_alone", "see_friends", "at_uni", "concen1", "concen2","lc_ind")

order<- c( "age","woman","bourse", "undergrad1",  "labormarket1",  "fatigue_level",  "Frm",  "sante_student",  "art_language",  "law_econ",  "sci_tech",  "social_science",
  "having_financial_struggles",  "solve_extra_expenses",  "own_salary",  "low_proba_career",  "low_proba_success",  "pessimistic",  "importance_finish_parents",  "importance_grade_parents",
  "migrant",  "living_alone",  "father_university",  "mother_university",  "both_parents_work",  "depression",  "anxiety",  "selfcov",  "closecov",  "personal_trau_covid",  "family_lost_job",
  "positive_vaccine",  "lockdown_alone",  "see_friends",  "at_uni",  "concen1",  "concen2",  "lc_ind")

covariate_names <- c(  "Age",  "Woman",  "Scholarship",  "1st year student",  "Close to labor market",  "Fatigued",  "First round matrices",  "Health Sciences",  "Arts and Languages",  "Law, Economics, Management",  "Science and Technology",   "Humanities and Social Science",  "Having financial struggles",
  "Can afford extra expenses",  "Having own salary",  "Low prob. success career",  "Low prob. success studies",  "Pessimistic about the next 5 years",  "Pressure to have diploma",  "Pressure to have good grades",
  "Migrant",  "Living alone",  "Father university degree",  "Mother university degree",  "Both parents work",  "Depression",  "Anxiety",  "Had Covid-19",  "Family member had Covid-19",  "Personal traumatic Covid experience",  "Family member lost job",  "Positive attitude tw vaccination",
  "Lock-down alone",  "Seeing friends",  "Going to the university",  "Cognitive rigidity",  "Cognitive control",  "Locus of control")


        ############## Outcome: Cognitive performance ################
# Outcome: 
outcome <- "Srm"


        ############ Treatment: Econ & threshold treatment ###########
treatment <- "treat_econ"

# Data
data_het <- subset(dt.het, treat_social==0 & threshold==1)
data <- data_het %>% drop_na(treat_econ)

n <- nrow(data)


# Prepare dataset
fmla <- formula(paste0("~ 0 + ", paste0(covariates, collapse="+")))
options(na.action='na.pass')
X <- model.matrix(fmla, data)
W <- data[,treatment]
Y <- data[,outcome]

# Number of rankings that the predictions will be ranking on 
num.rankings <- 4

# Prepare for data.splitting
# Assign a fold number to each observation.
# The argument 'clusters' in the next step will mimick K-fold cross-fitting.
num.folds <- 5

      # Causal Forest: start the loop
seed  <- 123
nb_rep <- 100
rankings <- matrix(NA, n, ncol=nb_rep)
tau_hat_all <- matrix(NA, n, ncol=nb_rep)

for (loop in 1:nb_rep) {
  folds <- sort(seq(n) %% num.folds) + 1
  seed <- seed + 1
  set.seed(seed)
    # reshuffle
  folds <- sample(folds)

  # Randomized settings with fixed and known probabilities (here: 0.33).
  forest <- causal_forest(X, Y, W, W.hat=.33, clusters = folds, num.trees=20000, seed=(123))
  
  # Retrieve out-of-bag predictions.
  tau.hat <- predict(forest)$predictions
  
  # Rank observations *within each fold* into top and bottom half according to their CATE prediction 
  ranking <- rep(NA, n)
  for (fold in seq(num.folds)) {
    tau.hat.quantiles <- quantile(tau.hat[folds == fold], probs = seq(0, 1, by=1/num.rankings))
    ranking[folds == fold] <- cut(tau.hat[folds == fold], tau.hat.quantiles, include.lowest=TRUE, labels=seq(num.rankings))
  }
  
  rankings[,loop] <- ranking
  tau_hat_all[,loop]<-tau.hat 
}


# Averaging over the loops
ranking_final <- rowMeans(rankings)
tau_hat_final <- rowMeans(tau_hat_all)

# Adding to data and defining final rank
data <- data %>% cbind(ranking_final, tau_hat_final) %>%  mutate(ranking_quar = cut_number(ranking_final, 4, label = FALSE))
data <- data %>% mutate(neg_CATE = ifelse(ranking_quar ==1, 1, ifelse(ranking_quar ==4, 0, NA)))


# Balance table
## Check if different groups have different average covariates levels across rankings.
table <- data %>% filter(neg_CATE==1 |neg_CATE==0 )
# table <- table %>% dplyr::select(c(paste0(covariates), tau_hat_final, neg_CATE, Srm)) %>% balance_table( "neg_CATE")
table <- table %>% dplyr::select(c(paste0(covariates), neg_CATE)) %>% balance_table( "neg_CATE")
table  <-table %>% rename( Variables = variables1, 
                           CATE_above_median= Media_control1,
                           CATE_below_median = Media_trat1) 
table  <-table %>% mutate(Diff= CATE_above_median- CATE_below_median)
table  <-table %>% data.frame()

pval<- as.matrix(table$p_value1)
p <- apply(pval, 1,  function(x) sapply(x, star))
p_stars <- paste(sprintf("%.2f", pval), p, sep="")

table1<- cbind.data.frame(table, p_stars )
table1<-  subset(table1, select = -c(p_value1))
table1<- table1 %>% slice(match(order, Variables))

table1<-  subset(table1, select = -c(Variables))
row.names(table1) <-  covariate_names


print(xtable(table1,  digits = 3), hline.after = NULL,
      include.colnames=FALSE,include.rownames=TRUE, only.contents = TRUE, type = "latex", file = 
        file.path(table_path, "CF_balance_labor_threshold.tex"))




























